# perform a bootstrap-type exercise to get at sampling variation in court cases

include("cenf.jl")

using CSV, StatFiles, FileIO, Random
using Gadfly

# a helper function
function getfirstexceed(number::Float64, cdf::Vector{Float64})
    for i = 1:size(cdf,1)
        (cdf[i] > number) && return i    
    end
end

# File format:
# countrycode upstream downstream fraction delta delta_withtime n_div_gm n_div_f
data = CSV.read("$(root)/data/dataset.csv")

# Initialize
isocodes=zeros(Int64,size(data,1))
indices=zeros(Int64,999)
lhs=zeros(Float64,109,35,35)
z_gm=zeros(Float64,109,35,35)
z_f=zeros(Float64,109,35,35)
δ=zeros(Float64,109,35,35)
δt=zeros(Float64,109,35,35)
downstream = data[:,3]
upstream = data[:,2]
oldisocode = 0
index = 0

for i=1:size(data,1)
    global oldisocode, data, isocodes, indices, lhs, z_gm, z_f, δ, δt, downstream, upstream, index
    if oldisocode!=data[i,1]
        index+=1
    end
    # data format:
    # countrycode upstream downstream fraction delta delta_withtime n_div_gm n_div_f
    # ISO code as a function of the index
    isocodes[index]=data[i,1]
    # indices as a function of the ISO code
    indices[round(Int64,data[i,1])]=index
    # we set all matrix variables so that the downstream sector
    # is the row index (first), and the upstream sector is the
    # column index (second), to be more in line with the paper
    # (notation X_ni/X_n)
    lhs[index,downstream[i],upstream[i]]=data[i,4]
    # truncate δ at 1
    δ[index,downstream[i],upstream[i]] = data[i,5] < 1.0 ? data[i,5] : 1.0
    δt[index,downstream[i],upstream[i]]=data[i,6]
    z_gm[index,downstream[i],upstream[i]]=data[i,7]
    z_f[index,downstream[i],upstream[i]]=data[i,8]
    oldisocode=data[i,1];
end

# run regressions
using FixedEffectModels, DataFrames
using Statistics, ShiftedArrays

data = DataFrame(load("$(root)/data/dataset_bootstrap.csv"))

data[!,:int_gm] = data[!,:delta] .* data[!,:n_div_gm] 
data[!,:int_f] = data[!,:delta] .* data[!,:n_div_f] 
data[!,:countrycode] = categorical(data[!,:countrycode])
data[!,:upstream] = categorical(data[!,:upstream])
data[!,:downstream] = categorical(data[!,:downstream])
data[!,:fraction_demean] = data[!,:fraction] / std(data[!,:fraction])
data

# baseline
#rr1 = reg(data, @formula(fraction ~ int_gm + fe(countrycode)&fe(upstream) + fe(countrycode)&fe(downstream) + fe(downstream)&fe(upstream)))

# data[!,:int_gm_demean] = data[!,:int_gm] / std(data[!,:int_gm])
# data[!,:int_f_demean] = data[!,:int_f] / std(data[!,:int_f])
# rr1 = reg(data, @formula(fraction_demean ~ int_gm_demean + fe(countrycode)&fe(upstream) + fe(countrycode)&fe(downstream) + fe(downstream)&fe(upstream)))
# rr2 = reg(data, @formula(fraction_demean ~ int_f_demean + fe(countrycode)&fe(upstream) + fe(countrycode)&fe(downstream) + fe(downstream)&fe(upstream)))
# rr1.coef[1]/(std(data[!,:fraction]) / std(data[!,:int_gm]))

# resample
sort!(data, [:countrycode, :downstream, :upstream])
cases = data[data[!,:countrycode] .== 8 , :numberOfCases]

# construct CDF to draw from
c = cases ./ (sum(cases))
cdf = cumsum(c)

# draw the z measures
N_RUNS = 300
SIZE_GENERATED = sum(cases)
rng = MersenneTwister(16091985)
cases_gen = zeros(Int64,size(cases,1),N_RUNS)
for run = 1:N_RUNS
    global cases_gen, cdf
    # generate random's
    gen = rand(rng, Float64, SIZE_GENERATED)
    for gindex = 1:size(gen,1)
        cases_gen[getfirstexceed(gen[gindex],cdf),run] += 1
    end
end
coefs = zeros(Float64, N_RUNS)
for run = 1:N_RUNS
    # construct the DF and run the regression 
    data[!, :z_drawn] = repeat(cases_gen[:,run],109)
    data[!, :int_drawn] = (data[!, :z_drawn] ./ data[!, :denom_gm]) .* data[!, :delta]
    data[ismissing.(data[!, :int_drawn]) .| isnan.(data[!, :int_drawn]), :int_drawn] .= 0.0
    data[!, :int_drawn_demean] = data[!,:int_drawn] ./ std(data[!,:int_drawn])
    rr = reg(data, @formula(fraction_demean ~ int_drawn_demean + fe(countrycode)&fe(upstream) + fe(countrycode)&fe(downstream) + fe(downstream)&fe(upstream)))
    println("$run : $(rr.coef[1])")
    coefs[run] = rr.coef[1]
end
# plot
p = Gadfly.plot(x = coefs, Geom.histogram(bincount = 20), Guide.xlabel("Estimated coefficient of interaction term"))
draw(SVG("../output/bootstrap-gm-samesamplesize.svg",15cm, 10cm),p)
run(`cairosvg ../output/bootstrap-gm-samesamplesize.svg -o ../output/bootstrap-gm-samesamplesize.pdf`)

# with f-measure
coefs = zeros(Float64, N_RUNS)
for run = 1:N_RUNS
    # construct the DF and run the regression 
    data[!, :z_drawn] = repeat(cases_gen[:,run],109)
    data[!, :int_drawn] = (data[!, :z_drawn] ./ data[!, :denom_f]) .* data[!, :delta]
    data[ismissing.(data[!, :int_drawn]) .| isnan.(data[!, :int_drawn]), :int_drawn] .= 0.0
    data[!, :int_drawn_demean] = data[!,:int_drawn] ./ std(data[!,:int_drawn])
    rr = reg(data, @formula(fraction_demean ~ int_drawn_demean + fe(countrycode)&fe(upstream) + fe(countrycode)&fe(downstream) + fe(downstream)&fe(upstream)))
    println("$run : $(rr.coef[1])")
    coefs[run] = rr.coef[1]
end
# plot
p = Gadfly.plot(x = coefs, Geom.histogram(bincount = 20), Guide.xlabel("Estimated coefficient of interaction term"))
draw(SVG("../output/bootstrap-f-samesamplesize.svg",15cm, 10cm),p)
run(`cairosvg ../output/bootstrap-f-samesamplesize.svg -o ../output/bootstrap-f-samesamplesize.pdf`)

